set.seed(2)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander", "ggridges", "spatstat", "broom")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void() +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)df <- bind_rows(
pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_100_to_500/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "100-500"),
pmap_df(list(0:99), ~read_csv(glue("../proc_data/ahref/export_keywords_500_to_1000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "500-1000"),
pmap_df(list(0:49), ~read_csv(glue("../proc_data/ahref/export_keywords_1000_to_10000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "1000-10000"),
pmap_df(list(0:33), ~read_csv(glue("../proc_data/ahref/export_keywords_10000_to_1000000000/{.x}.csv"))) %>%
clean_names() %>% mutate(cat = "10000+")
)The volume looks weird. It doesn’t follow the initial categories:
df %>% drop_na(volume) %>%
group_by(cat) %>%
summarise(
n = n(),
mean = mean(volume),
median = median(volume),
max = max(volume),
min = min(volume)
) %>%
arrange(median) %>%
pander()| cat | n | mean | median | max | min |
|---|---|---|---|---|---|
| 100-500 | 714519 | 291.3 | 100 | 7330000 | 10 |
| 500-1000 | 776898 | 407.7 | 200 | 5750000 | 10 |
| 1000-10000 | 402317 | 1064 | 300 | 5310000 | 10 |
| 10000+ | 272614 | 13835 | 800 | 45260000 | 10 |
Let’s look at some of the searches that had a low search volume in the initial data set:
df %>% filter(cat == "100-500", volume > 10000) %>%
select(keyword, volume) %>%
head(5) %>%
pander()| keyword | volume |
|---|---|
| new orleans | 336000 |
| beauty | 135000 |
| mine | 84000 |
| stupid | 83000 |
| newport news | 32000 |
This doesn’t looks like low volume words. And indeed they were not in the original data sets for 100-500 volume. I’m not sure how they came in.
This removes those keywords that were not in the same category in the data samples I created:
df_orig <- bind_rows(
pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_100_to_500/{.x}.txt"), col_names = c("keyword"))) %>%
clean_names() %>% select(keyword) %>% mutate(cat = "100-500"),
pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_500_to_1000/{.x}.txt"), col_names = c("keyword"))) %>%
clean_names() %>% mutate(cat = "500-1000"),
pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_1000_to_10000/{.x}.txt"), col_names = c("keyword"))) %>%
clean_names() %>% mutate(cat = "1000-10000"),
pmap_df(list(0:99), ~read_csv(glue("../raw_data/keywords_10000_to_1000000000/{.x}.txt"), col_names = c("keyword"))) %>%
clean_names() %>% mutate(cat = "10000+")
)There are still ~2.5 million rows left:
| Number of rows |
|---|
| 2,494,536 |
Another issue is that of the representativeness of the samples. Keywords with less than 100 volume are not even represented. And the othe rvolumes are represented at skewed ratios. This makes a large difference, especially if we look at stats based on search instead of based on volume.
To me, it still makes most sense to look at it based on volume. But you’ve been quite clear that this is not what we want. Still, it seems wrong to me to use this unrepresentative dataset, and report things such as mean. That will fully depend on how we happened to create the samples. Since most of searches have low volume, removing those with volume below 100 makes a huge difference.
So, for analyses where this is important, I will perform them in three different ways.
Using the samples here directly
Using the samples here, but scaled so that they are representative of the original data set. (Except that < 100 volume is removed.)
Using the scaled samples, and also go by volume instead of count.
As we can see, these three approaches give quite different results.
length_keyword_files <- function(min, max){
sql <- glue("SELECT count(*) as `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume >= {min}
AND keyword_info_search_volume < {max}")
tb <- bq_project_query("dataforseo-bigquery", sql)
df <- bq_table_download(tb) %>% mutate(min = min, max = max)
}
scaling <- map2_df(c(0, 100, 500, 1000, 10000), c(100, 500, 1000, 10000, 1000000000), length_keyword_files) %>%
mutate(factor = count / 1000000) %>% relocate(min, max)
scaling %>% pander()| min | max | count | factor |
|---|---|---|---|
| 0 | 100 | 531523126 | 531.5 |
| 100 | 500 | 33059825 | 33.06 |
| 500 | 1000 | 5766061 | 5.766 |
| 1000 | 10000 | 8449417 | 8.449 |
| 10000 | 1e+09 | 1284194 | 1.284 |
df %<>% mutate(
difficulty_cat = case_when(
difficulty <= 10 ~ "Easy\n(0-10)",
between(difficulty, 11, 30) ~ "Medium\n(11-30)",
between(difficulty, 31, 70) ~ "Hard\n(31-70)",
between(difficulty, 71, 100) ~ "Super hard\n(71-100)"
)) %>%
mutate(difficulty_cat = factor(difficulty_cat, levels = c("Easy\n(0-10)", "Medium\n(11-30)", "Hard\n(31-70)", "Super hard\n(71-100)"))) %>%
mutate(log_volume = log10(volume))
dfs <- df %>% sample_n(20000)
rdf <- bind_rows(
df %>% filter(between(volume, 100, 500)) %>%
sample_n(scaling %>% filter(min == 100) %>% pull(factor) * 2000),
df %>% filter(between(volume, 100, 500)) %>%
sample_n(scaling %>% filter(min == 500) %>% pull(factor) * 2000),
df %>% filter(between(volume, 1000, 10000)) %>%
sample_n(scaling %>% filter(min == 1000) %>% pull(factor) * 2000),
df %>% filter(between(volume, 10000, 1000000000)) %>%
sample_n(scaling %>% filter(min == 10000) %>% pull(factor) * 2000)
)Mean and median of sample:
tribble(~Mean, ~Median,
round(mean(df$difficulty, na.rm = T), 2), median(df$difficulty, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 14.28 | 6 |
Mean and median of representative sample:
tribble(~Mean, ~Median,
round(mean(rdf$difficulty, na.rm = T), 2), median(rdf$difficulty, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 14.5 | 6 |
Mean and median of sample by volume
tribble(~Mean, ~Median,
round(weighted.mean(rdf$difficulty, rdf$volume, na.rm = T), 2), weighted.median(rdf$difficulty, rdf$volume)) %>%
pander()| Mean | Median |
|---|---|
| 37.19 | 33 |
dfs %>%
ggplot(aes(x = volume, y = difficulty)) +
geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
scale_x_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
labs(title = "Keyword difficulty and volume")For each doubling of volume, the difficulty increases by 1.63:
##
## Call:
## lm(formula = difficulty ~ log2(volume), data = dfs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.405 -12.575 -7.412 6.201 87.562
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.83205 0.50911 3.598 0.000321 ***
## log2(volume) 1.62619 0.06295 25.834 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.45 on 16063 degrees of freedom
## (3935 observations deleted due to missingness)
## Multiple R-squared: 0.03989, Adjusted R-squared: 0.03983
## F-statistic: 667.4 on 1 and 16063 DF, p-value: < 2.2e-16
dfs %>% drop_na(difficulty_cat) %>%
ggplot(aes(y = volume, x = difficulty_cat)) +
geom_violin(draw_quantiles = c(0.5)) +
scale_y_log10(labels = comma) +
labs(x = "Difficulty category", title = "Keyword difficulty and volume")People will most likely not understand a violin chart. Alternatives? box blot?
!!!J: In my experience people understand violin charts equally well as box charts, and I have shown them quite a few times. It is almost a box chart, with a median, and some area above and below. So I think we can keep it. If you insist, it’s an easy change to make box charts instead. Although I do think they are not really needed here, since the scatter plots are good.
dfs %>%
ggplot(aes(x = difficulty, y = cpc)) +
geom_jitter(size = 0.1, alpha = 0.1, height = 0.08, width = 0.3) +
scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
labs(title = "Keyword difficulty and cpc")dfs %>% drop_na(difficulty_cat) %>%
ggplot(aes(y = cpc, x = difficulty_cat)) +
geom_violin(draw_quantiles = c(0.5)) +
scale_y_log10(labels = comma) +
labs(x = "Difficulty category", title = "Keyword difficulty and cpc")!!!D: similar comments as above.
!!!D: Shame we cannot use the keyword categories here. May be we can try to solve this after the 3rd of November if time remains. Just curios how large the df would be if we select just the columns keyword and info_categories from the original data set. We could do on the google big query page to avoid that rstudio crashes (plus, apply some filters). Just curios to know: You probably have thought of running a left join but why is that not possible?
!!!J: How would you do that exactly in practice? Like, what do I left join on, concretely? I know, the keywords from here. But how do I get that list into the database or the SQL command? Not saying it’s not possible, Im just not sure how to do it.
Note there are (at least) two additional SERP feature types, knowledge panel and videos, for which the sample size is too small to be included.
dff <- dfs %>%
select(keyword, volume, clicks, cpc, serp_features, cps) %>%
separate_rows(serp_features, sep = ",") %>%
mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features)) %>%
filter(!(serp_features %in% c("Videos", "Knowledge panel")))
nones <- dff %>% filter(serp_features == "(None)")
dffn <- dff %>% group_by(keyword) %>%
summarise(n_serp = n()) %>%
mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>%
mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>%
mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))
dffn <- left_join(dffn, dfs, by = "keyword")In sample:
dff %>% group_by(serp_features) %>%
summarise(prop = n() / nrow(dfs)) %>%
ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
scale_x_continuous(labels = scales::percent) +
labs(y = "SERP feature", x = "", title = "Presence of SERP features")rdff <- rdf %>%
select(keyword, volume, clicks, cpc, serp_features, cps) %>%
separate_rows(serp_features, sep = ",") %>%
mutate(serp_features = ifelse(is.na(serp_features), "(None)", serp_features)) %>%
filter(!(serp_features %in% c("Videos", "Knowledge panel")))
nones <- rdff %>% filter(serp_features == "(None)")
rdffn <- rdff %>% group_by(keyword) %>%
summarise(n_serp = n()) %>%
mutate(n_serp = ifelse(keyword %in% nones$keyword, 0, n_serp)) %>%
mutate(n_serp = ifelse(n_serp >= 6, "6+", as.character(n_serp))) %>%
mutate(n_serp = factor(n_serp, levels = c("0", "1", "2", "3", "4", "5", "6+")))
rdffn <- left_join(dffn %>% select(keyword, n_serp), dfs, by = "keyword")Representative:
rdff %>% group_by(serp_features) %>%
summarise(prop = n() / nrow(rdff)) %>%
ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
scale_x_continuous(labels = scales::percent) +
labs(y = "SERP feature", x = "", title = "Presence of SERP features")By volume:
rdff %>% group_by(serp_features) %>%
summarise(prop = sum(volume) / sum(rdff$volume)) %>%
ggplot(aes(y = reorder(serp_features, prop), x = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
scale_x_continuous(labels = scales::percent) +
labs(y = "SERP feature", x = "", title = "Presence of SERP features")In sample:
dffn %>% group_by(n_serp) %>%
summarise(n = n() / nrow(dffn)) %>%
ggplot(aes(x = n_serp, y = n)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
scale_y_continuous(labels = scales::percent)By volume:
rdffn %>% group_by(n_serp) %>%
summarise(prop = sum(volume) / sum(rdff$volume)) %>%
ggplot(aes(x = n_serp, y = prop)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "Number of serp features", y = "", title = "Distribution of SERP features") +
scale_y_continuous(labels = scales::percent)The knowledge card has a huge effect in reducing the cps, while the other SERP features have limited effect. Searches with the Shopping results SERP feature have higher cps on average.
order <- dff %>%
group_by(serp_features) %>%
summarise(mean_cps = mean(cps, na.rm = T)) %>%
arrange(mean_cps) %>%
pull(serp_features)
dff %>%
mutate(serp_features = factor(serp_features, levels = order)) %>%
ggplot(aes(y = serp_features, x = cps)) +
stat_density_ridges(fill = "turquoise4", color = "black") +
labs(y = "SERP feature", title = "SERP features and cps")order <- dff %>%
group_by(serp_features) %>%
summarise(mean_volume = mean(volume, na.rm = T)) %>%
arrange(mean_volume) %>%
pull(serp_features)
dff %>%
mutate(serp_features = factor(serp_features, levels = order)) %>%
ggplot(aes(y = serp_features, x = volume)) +
scale_x_log10(labels = comma) +
stat_density_ridges(fill = "turquoise4", color = "black") +
labs(y = "SERP feature", title = "SERP features and cps")Low difficulty keywords have fewer SERP features
rdffn %>% mutate(n_serp = as.numeric(n_serp)) %>%
drop_na(difficulty_cat) %>%
ggplot(aes(x = difficulty_cat, y = n_serp)) +
geom_boxplot() +
labs(x = "Difficulty", y = "SERP features", title = "Difficulty and number of SERP features")rdffn %>% group_by(n_serp) %>%
summarise(difficulty = mean(difficulty, na.rm = T)) %>%
ggplot(aes(x = n_serp, y = difficulty)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black", width = 0.8) +
labs(x = "SERP features", title = "Number of SERP features and mean difficulty", y = "Difficulty")find_pairs <- function(){
rs <- rdff %>% filter(serp_features != "(None)") %>%
sample_n(20000)
kw <- rs %>% distinct(keyword) %>% pull(keyword)
get_table <- function(k){
a <- rs %>% filter(keyword == k, serp_features != "(None)")
crossing(v1 = a$serp_features, v2= a$serp_features) %>% filter(v1 < v2)
}
pairs <- map_dfr(kw, get_table) %>%
group_by(v1, v2) %>%
summarise(n = n())
pairs %>% write_csv("../proc_data/serp_pairs.csv")
}
pairs <- read_csv("../proc_data/serp_pairs.csv") %>%
mutate(n = n / sum(n))
pairs %>%
arrange(desc(n)) %>%
head(10) %>%
mutate(pair = glue("{v1} & {v2}")) %>%
ggplot(aes(y = reorder(pair, n), x = n)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
labs(x = "", y = "SERP feature pair", title = "Most common SERP feature pairings") +
scale_x_continuous(labels = scales::percent)Definition SERP feautres: A SERP feature is any result on a Google Search Engine Results Page (SERP) that is not a traditional organic result. The most common SERP Features are: Rich Snippets which add a visual layer to an existing result (e.g., review stars for product ratings).
!!!D: after the 3rd, may make sense to check if there is anything interesting on relationship between keyword category and SERP features.
Some definitions:
The Clicks column shows exactly how many times per month people tend to click any pages when googling this keyword. Some searches result in a lot of clicks, while other high search volume keywords may not bring in as much traffic from search due to the low number of clicks.
The keyword “chauffeur” has a high search volume of 67,000 searches per month. Yet that volume only resulted in 13,406 clicks. One probable reason could be that Google already gave what people wanted instantly - and there was no need to click on the search results.
!!!D: CPS column definition
The CPS (Click per Search) shows an average number of clicks for all searches. It is basically a correlation between the Clicks metric and the Search Volume of the keyword.
In the example given, people search for “wow chauffeur” less frequently than “chauffeur”, yet the keyword has more Clicks than Searches.
Further investigation reveals that the word “wow” actually stands for “World of Warcraft”, and apparently, people are looking for information on how to summon a “chauffeur” in the game. That makes for a completely different search intent.
And this is why we have the CPS metric.
The higher the CPS (i.e people clicking on a few links to satisfy their search query) – the more chances that you’ll get some traffic even if you’re not ranking #1 for that search query.
More info: https://help.ahrefs.com/en/articles/624151-what-does-clicks-stand-for-in-keywords-explorer
dfv <-
bind_rows(
df %>% mutate(region = "US"),
df %>% mutate(volume = global_volume - volume, region = "International")
)In ahref, international volume is higher:
dfv %>% group_by(region) %>%
summarise(volume = sum(volume, na.rm = T)) %>%
mutate(volume = scales::percent(volume / sum(volume))) %>%
pander()| region | volume |
|---|---|
| International | 67% |
| US | 33% |
However, in the original data set, US volume is much higher:
## # A tibble: 2 x 2
## region volume
## <chr> <chr>
## 1 US 82%
## 2 International 19%
I will be going by ahref in the following.
Internationally there are more searches with very low volume, while US has more searches with medium volume.
dfv %>% drop_na(volume) %>%
mutate(volume_group = case_when(volume < 100 ~ "< 100",
between(volume, 100, 1000) ~ "100 - 1000",
between(volume, 1000, 10000) ~ "1000 - 10,000",
volume > 10000 ~ "10,000 +")) %>%
mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 +"))) %>%
group_by(volume_group, region) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(n = n / sum(n)) %>%
ggplot(aes(x = volume_group, y = n, fill = region)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.8, color = "black") +
labs(fill = "Region", x = "Volume", y = "") +
scale_y_continuous(labels = scales::percent, expand = c(0,0)) There is not a large difference in the number of searches with very high volume. However, the total volume of these searches is a lot higher internationally
dfv %>% drop_na(volume) %>%
mutate(volume_group = case_when(volume < 100 ~ "< 100",
between(volume, 100, 1000) ~ "100 - 1000",
between(volume, 1000, 10000) ~ "1000 - 10,000",
between(volume, 10000, 100000) ~ "10,000 - 100,000",
between(volume, 100000, 1000000) ~ "100,000 - 1M",
volume > 1000000 ~ "1M +")) %>%
mutate(volume_group = factor(volume_group, levels = c("< 100", "100 - 1000", "1000 - 10,000", "10,000 - 100,000", "100,000 - 1M", "1M +"))) %>%
group_by(volume_group, region) %>%
summarise(n = sum(volume)) %>%
ungroup() %>%
mutate(n = n / sum(n)) %>%
ggplot(aes(x = volume_group, y = n, fill = region)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.8, color = "black") +
labs(fill = "Region", x = "Volume", y = "") +
scale_y_continuous(labels = comma, expand = c(0,0))df_int <- df %>% mutate(international_volume = global_volume - volume) %>%
filter(international_volume > 0, global_volume > 0) %>%
mutate(volume_diff = log10(international_volume) - log10(volume))
df_int_s <- df_int %>% sample_n(200000)
df_int_s %>%
ggplot(aes(x = volume, y = international_volume)) +
geom_jitter(size = 0.05, alpha = 0.02, height = 0.15, width = 0.15) +
scale_x_log10(labels = comma, expand = c(0,0)) +
scale_y_log10(labels = comma, expand = c(0,0)) +
geom_abline(intercept = 0, slope = 1, color = "turquoise4", size = 1) +
labs(x = "US volume", y = "International volume")We can see that they mostly follow each other, but there are some searches with large difference between them.
Higher volume internationally:
tbl <- df_int %>%
arrange(desc(volume_diff)) %>%
filter(volume != 60) %>%
select(keyword, us_volume = volume, international_volume) %>%
head(5)
tbl %>% write_csv("../plots/csv/table_int.csv")
tbl %>% pander()| keyword | us_volume | international_volume |
|---|---|---|
| filmoviplex | 10 | 295990 |
| cloroquina | 200 | 5869800 |
| parivahan sewa | 10 | 276990 |
| jokaroom | 10 | 173990 |
| handball em | 20 | 327980 |
Higher volume in US:
tbl <- df_int %>%
arrange(volume_diff) %>%
select(keyword, us_volume = volume, international_volume) %>%
head(5)
tbl %>% write_csv("../plots/csv/table_us.csv")
tbl %>% pander()| keyword | us_volume | international_volume |
|---|---|---|
| football playoff schedule | 602000 | 1000 |
| frontier mail | 586000 | 1000 |
| spectrum mobile | 526000 | 1000 |
| chase bank near me | 523000 | 1000 |
| spectrum internet | 998000 | 2000 |
Searches that have higher volume in US have a higher click-per-search on average than searches that have higher volume internationally.
df_int_s %>%
filter(cps < 5) %>%
ggplot(aes(x = volume_diff, y = cps)) +
geom_point(alpha = 0.02, size = 0.02) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")
They also have a higher cost-per-click on average
df_int_s %>%
ggplot(aes(x = volume_diff, y = cpc)) +
geom_point(alpha = 0.08, size = 0.08) +
scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")
Searches that have higher volume internationally, tend to have higher difficulty
df_int_s %>%
ggplot(aes(x = volume_diff, y = difficulty)) +
geom_point(alpha = 0.08, size = 0.08) +
#scale_y_log10(labels = comma) +
geom_smooth(method='lm', formula= y~x) +
scale_x_continuous(breaks = c(-2, 0, 3), labels = c("More US", "0", "More international")) +
labs(x = "")In sample:
tribble(~Mean, ~Median,
round(mean(df$clicks, na.rm = T), 2), median(df$clicks, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 3036 | 306 |
In representative sample:
tribble(~Mean, ~Median,
round(mean(rdf$clicks, na.rm = T), 2), median(rdf$clicks, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 2150 | 261 |
By volume
tribble(~Mean, ~Median,
round(weighted.mean(rdf$clicks, rdf$volume, na.rm = T), 2), weighted.median(rdf$clicks, rdf$volume, na.rm = T)) %>%
pander()| Mean | Median |
|---|---|
| 397516 | 20913 |
log_mean <- 10 ^ (df %>% mutate(clicks = clicks + 1) %>%
mutate(log_clicks = log10(clicks)) %>%
summarise(m = mean(log_clicks, na.rm = T)) %>%
pull(m))
df %>% ggplot(aes(x = clicks)) +
geom_histogram(fill = "turquoise4", color = "black") +
scale_x_log10(labels = comma) +
scale_y_continuous(limits = c(0, 250000), expand = c(0,0)) +
labs(title = "Distribution of number of clicks", y = "", x = "") +
geom_vline(xintercept = log_mean, linetype = "dashed", color = "blue", size = 1) +
ggeasy::easy_remove_y_axis() Note that this is in sample, so the lowest part of the distribution is not included. Probably does not really make sense.
Comparison of searches with same volume but different return rates:
tbl <- bind_rows(
df %>% filter(return_rate > 10) %>%
summarise(mean_cpc = mean(cpc, na.rm = T), mean_clicks = mean(clicks, na.rm = T), mean_cpc = mean(cps, na.rm = T), mean_difficulty = mean(difficulty, na.rm = T)) %>%
mutate(return_rate = "very high") %>% relocate(return_rate),
df %>% filter(return_rate > 10) %>%
select(number, volume) %>%
left_join(df %>% filter(return_rate < 10), by = c("number", "volume")) %>%
distinct(number, volume, cat, .keep_all = T) %>%
summarise(mean_cpc = mean(cpc, na.rm = T), mean_clicks = mean(clicks, na.rm = T), mean_cpc = mean(cps, na.rm = T), mean_difficulty = mean(difficulty, na.rm = T)) %>%
mutate(return_rate = "low") %>% relocate(return_rate)
)
tbl %<>% mutate(mean_cpc = round(mean_cpc, 2), mean_clicks = round(mean_clicks, 0), mean_difficulty = round(mean_difficulty, 1))
tbl %>% write_csv("../plots/csv/return_rate.csv")
tbl %>% pander()| return_rate | mean_cpc | mean_clicks | mean_difficulty |
|---|---|---|---|
| very high | 0.96 | 71423 | 18.4 |
| low | 0.7 | 15094 | 25.6 |
We can see that searches with high return rates tend to have lower difficulty, and to be clicked on a lot more.